Initialisation de l’environnement

rm(list=ls())

Chargement des bibliothèques

# install.packages("tidytext")
library("tidytext")
# install.packages("tidyverse")
library("tidyverse")
## ── Attaching packages ─────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.1.0     ✔ purrr   0.2.5
## ✔ tibble  1.4.2     ✔ dplyr   0.7.8
## ✔ tidyr   0.8.2     ✔ stringr 1.3.1
## ✔ readr   1.1.1     ✔ forcats 0.3.0
## ── Conflicts ────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
# install.packages("igraph")
library("igraph")
## 
## Attaching package: 'igraph'
## The following objects are masked from 'package:dplyr':
## 
##     as_data_frame, groups, union
## The following objects are masked from 'package:purrr':
## 
##     compose, simplify
## The following object is masked from 'package:tidyr':
## 
##     crossing
## The following object is masked from 'package:tibble':
## 
##     as_data_frame
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## The following object is masked from 'package:base':
## 
##     union
# install.packages("reshape2")
library("reshape2")
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
# install.packages("wordcloud")
library("wordcloud")
## Loading required package: RColorBrewer
# install.packages("ggraph")
library("ggraph")
# install.packages("widyr")
library("widyr")

Chargement des données

data <- readLines("Frank Herbert - Dune.txt_clean")
head(data)
## [1] "= = = = = = "                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               
## [2] ""                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           
## [3] "A beginning is the time for taking the most delicate care that the balances are correct. This every sister of the Bene Gesserit knows. To begin your study of the life of Muad'Dib, then, take care that you first place him in his time: born in the 57th year of the Padishah Emperor, Shaddam IV. And take the most special care that you locate Muad'Dib in his place: the planet Arrakis. Do not be deceived by the fact that he was born on Caladan and lived his first fifteen years there. Arrakis, the planet known as Dune, is forever his place."
## [4] "-from \"Manual of Muad'Dib\" by the Princess Irulan"                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        
## [5] ""                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           
## [6] "    In the week before their departure to Arrakis, when all the final scurrying about had reached a nearly unbearable frenzy, an old crone came to visit the mother of the boy, Paul."

Ajout colonne “line”

data_split <- data_frame(line = 1:length(data), text = data)
head(data_split)

Ajouter colonne “chpaitre” pour chaque ligne

#data_split[data_split$line == "= = = = = =",]
data_split$chapitre <- cumsum(as.integer(data_split$text=="= = = = = ="))+1
dim(data_split)
## [1] 8048    3
head(data_split)

Transformation du jeu de données pour faire apparaître le numéro des chapitres

#le dataframe qui ne contient pas de ligne '= = = = = = "
data_chapter <- data_split[-which(data_split$text=="= = = = = ="),]
#nombre de ligne pour chaque ligne
table(data_chapter$chapitre)
## 
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18 
## 173 125 111 148 157 158 121  54 135  76 235  86  49 379 289 216  65 175 
##  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36 
##  51 210 211   4  73 204 222 213 249 124 158  95 177 122 193 216 297 148 
##  37  38  39  40  41  42  43  44  45  46  47  48 
## 247   4 143 116 341 121 277 214 159 155 136 368

En observant la table de fréquence, chapitre 22 et 38 ne contient que 4 lignes.

data_chapter[which(data_chapter$chapitre %in% c(22,38)),]

En fait, ce sont des lignes qui séparent les différentes grandes parties du livre. Dune contient 3 grandes parties : Book 1 , Book 2 et Book 3.

Ajouter les numéros de parties pour chaque ligne

#récupérer les noms de chqpitres qui ne contiennent que 4 lignes
sep_partie <- as.integer(names(which(table(data_chapter$chapitre)==4)))
#ajouter la colonne "partie"
data_chapter$partie <- cumsum(as.integer(data_chapter$chapitre %in% sep_partie))+1
#ajuster la colonne "chapitre" car elle va être décalé en raison de la suppression des chapitres 22 et 38  
data_chapter <- data_chapter[-which(data_chapter$chapitre %in% sep_partie),]
# data_clean : 46 facteurs
# data_brut : 51 facteurs
data_chapter$chapitre = as.factor(data_chapter$chapitre)
levels(data_chapter$chapitre) = as.factor(1:51)
data_chapter$partie = as.factor(data_chapter$partie)
levels(data_chapter$partie) = as.factor(1:3)

message("Nombre de ligne par chapitre")
## Nombre de ligne par chapitre
table(data_chapter$chapitre)
## 
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18 
## 173 125 111 148 157 158 121  54 135  76 235  86  49 379 289 216  65 175 
##  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36 
##  51 210 211  73 204 222 213 249 124 158  95 177 122 193 216 297 148 247 
##  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51 
## 143 116 341 121 277 214 159 155 136 368   0   0   0   0   0
message("Nombre de ligne par partie")
## Nombre de ligne par partie
table(data_chapter$partie)
## 
##    1    2    3 
## 3224 2738 2030
head(data_chapter)

Enlever les lignes “= = = = = =” et vide

data_chapter <- data_chapter[-which(data_chapter$text %in% c("= = = = = =","")),]
data_chapter <- data_chapter[-1,]
data_chapter$line <- 1:nrow(data_chapter)
head(data_chapter)

Sentiment par parties

data_part <- data_chapter %>% 
    unnest_tokens(word, text) 
# set factor to keep books in order of publication
data_part$partie <- factor(data_part$partie, levels = rev(1:3))
data_part
data_part %>%
  group_by(partie) %>% 
  mutate(word_count = 1:n(),
         index = word_count %/% 500 + 1) %>%
  inner_join(get_sentiments("bing")) %>%
  count(partie, index = index , sentiment) %>%
  ungroup() %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative,
         partie = factor(partie, levels = 1:3)) %>%
  ggplot(aes(index, sentiment, fill = partie)) +
  geom_bar(alpha = 0.5, stat = "identity", show.legend = FALSE) +
  facet_wrap(~ partie, ncol = 2, scales = "free_x")
## Joining, by = "word"

Sentiment per phrase

phrase <- c()

#stock les phrases
for (i in data_chapter$text){
  phrase <- c(phrase, unlist(str_split(i, "\\. \"|\\.\"|[.][:space:]+(?=[:upper:])")))
}

phrase <- gsub("\\\"|[.]","",phrase)

df_sentence <- data.frame(line = 1:length(phrase),sentence = phrase, stringsAsFactors = FALSE)

head(df_sentence)
# install.packages("sentimentr")
library(sentimentr)

senti <- c()
for (line in df_sentence$sentence) {
    tmp <- get_sentences(line)
    for(i in 1:length(tmp[[1]])) {
        senti_tmp <- tmp[[1]][i]
        senti <- c(senti, senti_tmp)
    }
}

df_sentr <- data.frame(senti, stringsAsFactors = FALSE)
df_sentr$senti <- as.character(df_sentr$senti)
sentiment <- sentiment(df_sentr$senti)
## Warning: Each time `sentiment` is run it has to do sentence boundary disambiguation when a
## raw `character` vector is passed to `text.var`. This may be costly of time and
## memory.  It is highly recommended that the user first runs the raw `character`
## vector through the `get_sentences` function.
df_sentr$sentiment <- as.numeric(sentiment$sentiment)
df_sentr$pntag <- ifelse(sentiment$sentiment == 0, 'Neutral',
                         ifelse(sentiment$sentiment > 0, 'Positive',
                                ifelse(sentiment$sentiment < 0, 'Negative', 'NA')))
# base R plot
plot(df_sentr$sentiment, type='l', pch=3)

# plotly- more fun
ax <- list(
    title = "Phrase",
    zeroline = FALSE,
    showline = FALSE,
    showticklabels = FALSE
)

# install.packages('plotly')
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:sentimentr':
## 
##     highlight
## The following object is masked from 'package:igraph':
## 
##     groups
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(magrittr)
## 
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
## 
##     set_names
## The following object is masked from 'package:tidyr':
## 
##     extract
plot_ly(data = df_sentr, x = ~senti, y = ~sentiment, color = ~pntag,
        type = 'scatter', mode = 'markers') %>% layout(xaxis = ax)

Tutoriel avec tidytext

data <- data[-which(data_split$text %in% c("= = = = = =",""))]
data <- data[-1]
df_initial <- data_frame(line = 1:length(data), text = data)
head(df_initial)

Tutoriel tidy text

Dataframe avec les tokens en ligne (sans stopwords)

df <- df_initial %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words)
## Joining, by = "word"
head(df)

Les mots les plus fréquents dans le livre

df %>%
  count(word, sort = TRUE) %>%
  filter(n > 300) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n)) +
  geom_col() +
  xlab("Tokens") +
  coord_flip()

Trier les mots les plus fréquents

df %>%
  count(word, sort = TRUE)

Negative et positive mots

df %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(colors = c("gray10", "gray50"),
                   max.words = 200, scale=c(3,.20))
## Joining, by = "word"

TF-IDF

freq_by_rank <- df_initial %>%
  unnest_tokens(word, text) %>%
  count(word, sort = TRUE) %>%
  ungroup() %>%
  mutate(rank = row_number(),
         total = sum(n),
         'term frequency' = n / total)

dim(freq_by_rank)
## [1] 11475     5
head(freq_by_rank)
rank_subset <- freq_by_rank %>%
  filter(rank < 500,
         rank > 10)

lm(log10(`term frequency`) ~ log10(rank), data = rank_subset)
## 
## Call:
## lm(formula = log10(`term frequency`) ~ log10(rank), data = rank_subset)
## 
## Coefficients:
## (Intercept)  log10(rank)  
##     -0.8387      -1.0192
freq_by_rank %>%
  ggplot(aes(rank, `term frequency`)) + 
  geom_abline(intercept = -0.837, slope = -1.0192, color = "gray50", linetype = 2) +
  geom_line(size = 1.1, alpha = 0.8, show.legend = FALSE) + 
  scale_x_log10() +
  scale_y_log10()

freq_by_rank %>%
  select(-total) %>%
  arrange(desc(`term frequency`))

n-gram

bigrams2 <- df_initial %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>% 
  na.omit()%>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word) %>%
  count(word1, word2, sort = TRUE)

trigrams3 <- df_initial %>%
  unnest_tokens(trigram, text, token = "ngrams", n = 3) %>%
  separate(trigram, c("word1", "word2", "word3"), sep = " ") %>%
  na.omit()%>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word) %>%
  filter(!word3 %in% stop_words$word) %>%
  count(word1, word2, word3, sort = TRUE)

quadrigrams4 <- df_initial %>%
  unnest_tokens(word, text, token = "ngrams", n = 4) %>%
  separate(word, c("word1", "word2", "word3", "word4"), sep = " ") %>%
  na.omit()%>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word) %>%
  filter(!word3 %in% stop_words$word) %>%
  filter(!word4 %in% stop_words$word) %>%
  count(word1, word2, word3, word4, sort = TRUE)

quintigrams5 <- df_initial %>%
  unnest_tokens(word, text, token = "ngrams", n = 5) %>%
  separate(word, c("word1", "word2", "word3", "word4", "word5"), sep = " ") %>%
  na.omit()%>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word) %>%
  filter(!word3 %in% stop_words$word) %>%
  filter(!word4 %in% stop_words$word) %>%
  filter(!word5 %in% stop_words$word) %>%
  count(word1, word2, word3, word4, word5, sort = TRUE)

head(bigrams2)
head(trigrams3)
head(quadrigrams4)
head(quintigrams5)
bigrams2_filtered <- bigrams2 %>%
  unite(bigram, word1, word2, sep = " ")

trigrams3_filtered <- trigrams3 %>%
  unite(bigram, word1, word2, word3, sep = " ")

quadrigrams4_filtered <- quadrigrams4 %>%
  unite(bigram, word1, word2, word3, word4, sep = " ")

quintigrams5_filtered <- quintigrams5 %>%
  unite(bigram, word1, word2, word3, word4, word5, sep = " ")
bigrams2_filtered
trigrams3_filtered
quadrigrams4_filtered
quintigrams5_filtered
bigram_graph <- bigrams2 %>%
  filter(n > 10) %>%
  graph_from_data_frame()

ggraph(bigram_graph, layout = "fr") +
  geom_edge_link() +
  geom_node_point() +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1)

bigram_graph <- bigrams2 %>%
  filter(n > 20) %>%
  graph_from_data_frame()

a <- grid::arrow(type = "closed", length = unit(.15, "inches"))

ggraph(bigram_graph, layout = "fr") +
  geom_edge_link(aes(edge_alpha = n), show.legend = FALSE,
                 arrow = a, end_cap = circle(.07, 'inches')) +
  geom_node_point(color = "lightblue", size = 5) +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
  theme_void()

section_words <- df_initial %>%
  mutate(section = row_number() %/% 10) %>%
  filter(section > 0) %>%
  unnest_tokens(word, text) %>%
  filter(!word %in% stop_words$word)
word_pairs <- section_words %>%
  pairwise_count(word, section, sort = TRUE)
word_pairs
word_cors <- section_words %>%
  group_by(word) %>%
  filter(n() >= 50) %>%
  pairwise_cor(word, section, sort = TRUE)

dim(word_cors)
## [1] 47306     3
word_cors %>%
  filter(correlation > .20) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
  geom_node_point(color = "lightblue", size = 5) +
  geom_node_text(aes(label = name), repel = TRUE) +
  theme_void()

CleanNLP pour annoter les entités

# # Load library
# library(cleanNLP)
# 
# library(reticulate)
# 
# # Setting up NLP backend
# cnlp_init_spacy()
# 
# # Get text book 1
# book1 <- paste(data_chapter$text[which(data_chapter$partie==1)], collapse = " ")
# 
# obj1 <- cnlp_annotate(book1, as_strings = TRUE)
# 
# head(obj1)
# 
# # Find the named entities in our text
# people <- cnlp_get_entity(obj1) %>% 
#   filter(entity_type == "PERSON") %>%
#   group_by(entity) %>%
#   count %>%
#   arrange(desc(n))
# 
# # Show the top 20 characters by mention
# people

Stereotpe de chaque groupe de personnage

# DATA PREPARATION ####
group_data <- c("House Atreides","House Harkonnen","Bene Gesserit","Bene Tleilax","Spacing Guild",
 "Honored Matres" ,"Fremen","Miscellaneous"  )

regex_groupe <- paste(group_data, collapse = "|") # regular expression

df_sentence <- df_sentence[-which(df_sentence$sentence %in% c(""," ")),]
df_sentence$line <- 1:nrow(df_sentence)
# LOAD IN BOOK TEXT 
groupes_sentences <- df_sentence %>%
  filter(grepl(regex_groupe, sentence)) %>% # exclude sentences without house reference
  cbind(sapply(group_data, function(x) grepl(x, .$sentence)))# identify references
# examine
max.char = 35 # define max sentence length
groupes_sentences %>%
  mutate(sentence = ifelse(nchar(sentence) > max.char, # cut off long sentences
                           paste0(substring(sentence, 1, max.char), "..."),
                           sentence)) %>% 
  head(5)
# custom capitalization function
Capitalize <- function(text){ 
  paste0(substring(text,1,1) %>% toupper(),
         substring(text,2))
}

# TO LONG FORMAT
groupe_long <- groupes_sentences %>%
  gather(key = group_data, value = test, -sentence, -line) %>% 
  mutate(group = group_data) %>% # capitalize names
  filter(test) %>% select(-c(group_data,test)) # delete rows where house not referenced
# examine
groupe_long %>%
  mutate(sentence = ifelse(nchar(sentence) > max.char, # cut off long sentences
                           paste0(substring(sentence, 1, max.char), "..."),
                           sentence)) %>% 
  head(20)
# set plot width & height
w = 19; h = 6  

# PLOT REFERENCE FREQUENCY
groupe_long %>%
  group_by(group) %>%
  summarize(n = n()) %>% # count sentences per house
  ggplot(aes(x = desc(group), y = n)) +
  geom_bar(aes(fill = group), stat = 'identity')+
  geom_text(aes(y = n+50, label = group),position = position_dodge(0.9),
    vjust = 0) +
  theme(axis.text.y = element_blank(),
        axis.ticks.y = element_blank(),
        legend.position = 'none') + 
  coord_flip()

Mot par groupe de personnage

# IDENTIFY WORDS USED IN COMBINATION WITH HOUSES
words_by_groupes <- groupe_long %>% 
  unnest_tokens(word, sentence, token = 'words') %>% # retrieve words
  mutate(word = gsub("'s", "", word)) %>% # remove possesive determiners
  group_by(group, word) %>% 
  summarize(word_n = n()) # count words per house
# examine
words_by_groupes %>% head()
# custom functions for reordering facet plots
# https://github.com/dgrtwo/drlib/blob/master/R/reorder_within.R
reorder_within <- function(x, by, within, fun = mean, sep = "___", ...) {
  new_x <- paste(x, within, sep = sep)
  reorder(new_x, by, FUN = fun)
}

scale_x_reordered <- function(..., sep = "___") {
  reg <- paste0(sep, ".+$")
  ggplot2::scale_x_discrete(labels = function(x) gsub(reg, "", x), ...)
}

# set plot width & height
w = 10; h = 7; 

# PLOT MOST FREQUENT WORDS PER HOUSE
words_per_groupe = 20 # set number of top words
words_by_groupes %>%
  group_by(group) %>%
  arrange(group, desc(word_n)) %>%
  mutate(top = row_number()) %>% # count word top position
  filter(top <= words_per_groupe) %>% # retain specified top number
  ggplot(aes(reorder_within(word, -top, group), # reorder by minus top number
             word_n, fill = group)) +
  geom_col(show.legend = F) +
  scale_x_reordered() +
  facet_wrap(~ group, scales = "free_y") + # facet wrap and free y axis
  coord_flip()

sans stopwords

words_by_groupes <- words_by_groupes %>%
  group_by(word) %>% mutate(word_sum = sum(word_n)) %>% # counts words overall
  group_by(group) %>% mutate(group_n = n()) %>%
  ungroup() %>%
    # compute ratio of usage in combination with house as opposed to overall
  # adjusted for house references frequency as opposed to overall frequency
  mutate(ratio = (word_n / (word_sum - word_n + 1) / (group_n / n()))) 
# examine
words_by_groupes %>% select(-word_sum, -group_n) %>% arrange(desc(word_n)) %>% head()
words_by_groupes %>%
  group_by(group) %>%
  arrange(group, desc(ratio)) %>%
  mutate(top = row_number()) %>% # count word top position
  filter(top <= words_per_groupe) %>% # retain specified top number
  ggplot(aes(reorder_within(word, -top, group), # reorder by minus top number
             ratio, fill = group)) +
  geom_col(show.legend = F) +
  scale_x_reordered() + 
  facet_wrap(~ group, scales = "free") +  # facet wrap and free scales
  coord_flip()